BME 499 - Final Project

David Chang and Mahmoud Komaiha
Due: April 24th, 2020

Task 1 - Processing Data

Reading in the table

cirrhosis = readtable('cirrhosis.csv');
% take out time temporarily, will add it back at end later
time = cirrhosis.time;
cirrhosis.time = [];
expandFigSize = [0 0 1000 700];

Add the extra variables

% Log some key variables
cirrhosis.logalb = log(cirrhosis.albumin);
cirrhosis.logbil = log(cirrhosis.bili);
cirrhosis.logpro = log(cirrhosis.protime);
% Rearrange data to put time as last variable
cirrhosis.time = time;

Store names and desc for later

% Make some variable descriptions
cirrhosis_desc = {'age','albumin','alkaline phosphatase','ascites','serum bilirubin','serum cholesterol', ...
'edema treatment','hepatomegaly','platelets','prothrombin time','sex','aspartate aminotransferase','spiders',...
'stage','censoring','treatment','trigycerides','urine copper','log albumin','log serum bilirubin',...
'log prothrombin time','time'};
% Label the Categorial Variables
categVar = {'ascites','edema','hepato','sex','spiders','stage','trt'};
% Subset key variables w/o log
subset = {'age','albumin','bili','alkphos','chol','ast','protime',...
'platelet','spiders','hepato','ascites','edema'};
% Subset key variables w/ generated log features
subset_log = {'age','albumin','bili','alkphos','chol','ast','protime','platelet',...
'spiders','hepato','ascites','edema','logalb','logbil','logpro'};
cirrhosis.Properties.VariableDescriptions = cirrhosis_desc;
cirrhosis_names = cirrhosis.Properties.VariableNames;
head(cirrhosis)
ans = 8×22 table
 agealbuminalkphosascitesbilicholedemahepatoplateletprotimesexastspidersstagestatustrttrigcopperlogalblogbillogprotime
158.76522.60001718114.50002611.0000119012.20001137.950014211721560.95552.67412.5014400
256.44634.14007.3948e+0301.10003020122110.60001113.5200130188541.42070.09532.36094500
370.07263.480051601.40001760.5000015112.0000096.10000421552101.24700.33652.48491012
454.74062.54006.1218e+0301.80002440.5000118310.3000160.6300142192640.93220.58782.33211925
538.10543.530067103.40002790113610.90001113.15001312721431.26131.22382.38881504
666.25873.980094400.800024801NaN11.0000193.0000032263501.3813-0.22312.39792503
755.53464.090082401.0000322012049.7000160.45000302213521.408502.27211832
853.05684.00004.6512e+0300.30002800037311.0000128.38000322189521.3863-1.20402.39792466

Task 2 - Exploratory Phase

Summarize data

cirr_summary = summary(cirrhosis);
N = numel(cirrhosis.Properties.VariableNames);
temp_table = struct('Min',zeros(N,1),'Max',zeros(N,1),'Range',zeros(N,1),...
'Median',zeros(N,1),'Mean',zeros(N,1),'NumMissing',zeros(N,1));
for i=1:N
temp_struct = cirr_summary.(cirrhosis_names{i});
temp_table.Min(i) = temp_struct.Min;
temp_table.Max(i) = temp_struct.Max;
temp_table.Mean(i) = mean(cirrhosis.(cirrhosis_names{i}));
temp_table.Median(i) = temp_struct.Median;
temp_table.Range(i) = temp_struct.Max - temp_struct.Min;
temp_table.NumMissing(i) = temp_struct.NumMissing;
end
sumTable = table(cirrhosis_names', ...
cirrhosis_desc', ...
temp_table.Min, ...
temp_table.Max, ...
temp_table.Mean, ...
temp_table.Median, ...
temp_table.Range, ...
temp_table.NumMissing, ...
'VariableNames',{'VarName','VarDesc','Min','Max','Mean','Median','Range','NumMissing'})
sumTable = 22×8 table
 VarNameVarDescMinMaxMeanMedianRangeNumMissing
1'age''age'26.277978.439450.019049.794752.16150
2'albumin''albumin'1.96004.64003.52003.55002.68000
3'alkphos''alkaline phosphatase'289.00001.3862e+041.9827e+0312591.3573e+040
4'ascites''ascites'010.0769010
5'bili''serum bilirubin'0.3000283.25611.350027.70000
6'chol''serum cholesterol'120.00001775NaN309.5000165528
7'edema''edema treatment'010.1106010
8'hepato''hepatomegaly'010.5128110
9'platelet''platelets'62.0000563NaN2575014
10'protime''prothrombin time'9.000017.100010.725610.60008.10000
11'sex''sex'010.8846110
12'ast''aspartate aminotransferase'26.3500457.2500122.5563114.7000430.90000
13'spiders''spiders'010.2885010
14'stage''stage'1.000043.0321330
15'status''censoring'020.8622020
16'trt''treatment'1.000021.4936110
17'trig''trigycerides'33.0000598NaN10856530
18'copper''urine copper'4.0000588NaN735842
19'logalb''log albumin'0.67291.53471.25081.26690.86180
20'logbil''log serum bilirubin'-1.20403.33220.57570.29944.53620
21'logpro''log prothrombin time'2.19722.83912.36862.36090.64190
22'time''time'41.000045562.0064e+031.8395e+0345150

Clean up and setup data for unupervised and supervised learning

Time and status are the two dependent variables. This cleanup is done to make separate datasets so that the two dependent variables are not in the same dataset.
cirrhosis = original dataset with missing values removed
cirrhosis_log = log(time) and status is removed - for linear regression, lasso, stepwise and RF_regression models
cirrhosis_status = time is removed - for PCA, k-means clustering, classification, so logistic regression and RF_classification models
nMissingRows = sum(any(ismissing(cirrhosis),2))
nMissingRows = 36
cirrhosis = rmmissing(cirrhosis);
cirrhosis_log = cirrhosis;
cirrhosis_log.time = log(cirrhosis_log.time);
cirrhosis_log = removevars(cirrhosis_log,{'status'});
cirrhosis_log_names = cirrhosis_log.Properties.VariableNames;
cirrhosis_status = cirrhosis(:,1:end-1);
status = cirrhosis_status.status;
cirrhosis_status.status = [];
cirrhosis_status.status = status > 1;
cirrhosis_status_names = cirrhosis_status.Properties.VariableNames;

Standardize Data with Z score

Toggle on and off with comments
Rerun and save different version of output when doing this
% status = cirrhosis_status.status;
% cirrhosis = zscore(table2array(cirrhosis));
% cirrhosis = array2table(cirrhosis,"VariableNames",cirrhosis_names);
% cirrhosis_log = zscore(table2array(cirrhosis_log));
% cirrhosis_log = array2table(cirrhosis_log,"VariableNames",cirrhosis_log_names);
% array2table(cirrhosis_log,"VariableNames",cirrhosis_log_names);
% cirrhosis_status = zscore(table2array(cirrhosis_status));
% cirrhosis_status = array2table(cirrhosis_status,"VariableNames",cirrhosis_status_names);
% cirrhosis_status.status = status;

Feature Histograms

f = figure();
set(f, 'Position', expandFigSize)
for i=1:size(cirrhosis,2)
subplot(5,5,i)
histogram(cirrhosis.(i))
title(cirrhosis_names{i})
end
sgtitle("Feature Histogram")

Boxplot

% Risk score computation as described in Cirrhosis paper
mayoFinalR = 0.871*cirrhosis.logbil ...
- 2.53*cirrhosis.logalb ...
+ 0.039*cirrhosis.age ...
+ 2.38*cirrhosis.logpro ...
+ 0.859*cirrhosis.edema;
% Scatterplot subset by risk high risk -> decreased survivability
f = figure();
set(f, 'Position', expandFigSize)
for i=1:size(cirrhosis,2)
subplot(5,5,i)
boxplot(cirrhosis.(i),mayoFinalR >= median(mayoFinalR),'Labels',{'Low Risk','High Risk'})
title(cirrhosis_names{i})
end
sgtitle("Feature Boxplot Grouped by Risk Type")

Scatter time vs. feature

f = figure();
set(f, 'Position', expandFigSize)
for i=1:size(cirrhosis,2)-1
subplot(5,5,i)
scatter(cirrhosis.(i),cirrhosis.time)
title(cirrhosis_names{i})
end
sgtitle("Scatter plot of Time vs Feature")

Scatter log time vs. feature

f = figure();
set(f, 'Position', expandFigSize)
for i=1:size(cirrhosis_log,2)-1
subplot(5,5,i)
scatter(cirrhosis_log.(i),cirrhosis_log.time)
title(cirrhosis_log_names{i})
end
sgtitle("Scatter plot of log(Time) vs Feature")

Task 3 - Statistical Analysis

Pairwise spearman rank correlations

figure();
[R_rank,P_rank] = corr(table2array(cirrhosis),'Type','Spearman','rows','complete');
r_rank_map = heatmap(R_rank);
title('Pairwise Rank Correlations')
r_rank_map.XDisplayLabels = cirrhosis_names;
r_rank_map.YDisplayLabels = cirrhosis_names;
% Features sorted by Correlation
absR_rank_time = abs(R_rank(1:end-1,end));
[absR_rank_time,I] = sort(absR_rank_time,'descend');
P_rank_time = P_rank(1:end-1,end);
cirrhosis_names_table = cirrhosis_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
rankTable = table(cirrhosis_names_table',absR_rank_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
rankTable = 21×3 table
 FeatureCorrelationpValue
1'bili'0.49820.0000
2'logbil'0.49820.0000
3'copper'0.43590.0000
4'status'0.40590.0000
5'albumin'0.39100.0000
6'logalb'0.39100.0000
7'stage'0.37160.0000
8'edema'0.33750.0000
9'ascites'0.33370.0000
10'hepato'0.26660.0000
11'spiders'0.25670.0000
12'ast'0.25040.0000
13'trig'0.19160.0014
14'platelet'0.16220.0069
15'age'0.14320.0173
16'protime'0.13660.0232
17'logpro'0.13660.0232
18'chol'0.08680.1504
19'alkphos'0.06260.2999
20'sex'0.05600.3537
21'trt'0.01330.8262

Pairwise pearson correlations

figure();
[R_pear,P_pear] = corr(table2array(cirrhosis),'Type','Pearson','rows','complete');
r_pear_map = heatmap(R_pear);
title('Pairwise Pearson Correlations')
r_pear_map.XDisplayLabels = cirrhosis_names;
r_pear_map.YDisplayLabels = cirrhosis_names;
% Features sorted by Correlation
absR_pear_time = abs(R_pear(1:end-1,end));
[absR_pear_time,I] = sort(absR_pear_time,'descend');
P_rank_time = P_pear(1:end-1,end);
cirrhosis_names_table = cirrhosis_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
pearsonTable = table(cirrhosis_names_table',absR_pear_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
pearsonTable = 21×3 table
 FeatureCorrelationpValue
1'logbil'0.48820.0000
2'bili'0.43030.0000
3'logalb'0.40260.0000
4'albumin'0.40190.0000
5'status'0.38450.0000
6'stage'0.36390.0000
7'copper'0.36140.0000
8'edema'0.34710.0000
9'ascites'0.32940.0000
10'hepato'0.26100.0000
11'spiders'0.26060.0000
12'ast'0.19110.0014
13'trig'0.16390.0064
14'platelet'0.15910.0081
15'age'0.14320.0173
16'chol'0.13670.0231
17'protime'0.12840.0330
18'logpro'0.12750.0343
19'alkphos'0.10440.0833
20'sex'0.02940.6269
21'trt'0.01930.7492

Pairwise pearson correlations with log time

figure();
[R_pear_log,P_pear_log] = corr(table2array(cirrhosis_log),'Type','Pearson','rows','complete');
r_pear_log_map = heatmap(R_pear_log);
title('Pairwise Pearson Correlations (Log Time)')
r_pear_log_map.XDisplayLabels = cirrhosis_log_names;
r_pear_log_map.YDisplayLabels = cirrhosis_log_names;
absR_pear_log_time = abs(R_pear_log(1:end-1,end));
[absR_pear_log_time,I] = sort(absR_pear_log_time,'descend');
P_rank_time = P_pear_log(1:end-1,end);
cirrhosis_names_table = cirrhosis_log_names(1:end-1);
cirrhosis_names_table = cirrhosis_names_table(I);
pearsonLogTable = table(cirrhosis_names_table',absR_pear_log_time,P_rank_time(I),...
'VariableNames',{'Feature','Correlation','pValue'})
pearsonLogTable = 20×3 table
 FeatureCorrelationpValue
1'edema'0.54020.0000
2'logbil'0.52420.0000
3'ascites'0.51330.0000
4'bili'0.50250.0000
5'logalb'0.46000.0000
6'albumin'0.44710.0000
7'copper'0.40190.0000
8'stage'0.38150.0000
9'logpro'0.31640.0000
10'protime'0.31500.0000
11'spiders'0.29340.0000
12'ast'0.24020.0001
13'platelet'0.22790.0001
14'hepato'0.22210.0002
15'age'0.22040.0002
16'trig'0.16420.0063
17'alkphos'0.04860.4214
18'chol'0.04310.4759
19'sex'0.03840.5249
20'trt'0.00950.8748

Task 4 - Unsupervised Learning

We did PCA and K-means clustering. We evaluated how the data groups with respect to the dependent categorical variable status.

PCA

% Remove status and try to find pca that detemines best features to predict status
cirrhosis_status_rm = cirrhosis_status;
cirrhosis_status_rm.status = [];
[coef, score, ~, ~, explained] = pca(zscore(table2array(cirrhosis_status_rm)));
% [coef, score, ~, ~, explained] = pca(table2array(cirrhosis_status_rm)); %Use this instead if you are working zscore data
% %Variance explained by PC1 and PC2
explained(1:2)
ans = 2×1
26.0297 11.6807
% Plot PCA and color by status
figure();
gscatter(score(:,1), score(:,2), cirrhosis_status.status)
xlabel('Principal Component 1')
ylabel('Principal Component 2')
title('PCA of Cirrhosis Patient data')
PC1_coeff = coef(:,1);
PC2_coeff = coef(:,2);
% Table of PCA Coefficients
T = table(cirrhosis_status_rm.Properties.VariableNames', PC1_coeff, PC2_coeff)
T = 20×3 table
 Var1PC1_coeffPC2_coeff
1'age'0.1256-0.2995
2'albumin'-0.28560.1006
3'alkphos'0.09000.1759
4'ascites'0.2788-0.1801
5'bili'0.32980.2530
6'chol'0.09630.4533
7'edema'0.2865-0.2080
8'hepato'0.21160.0657
9'platelet'-0.13080.2553
10'protime'0.2544-0.2676
11'sex'-0.03290.0521
12'ast'0.18020.3177
13'spiders'0.1987-0.0039
14'stage'0.2399-0.0663
15'trt'0.00600.0729
16'trig'0.15790.2934
17'copper'0.25310.1503
18'logalb'-0.28950.1048
19'logbil'0.34590.2797
20'logpro'0.2589-0.2677
% Table of PCA sorted by PC1
T_PC1 = sortrows(T,'PC1_coeff','descend')
T_PC1 = 20×3 table
 Var1PC1_coeffPC2_coeff
1'logbil'0.34590.2797
2'bili'0.32980.2530
3'edema'0.2865-0.2080
4'ascites'0.2788-0.1801
5'logpro'0.2589-0.2677
6'protime'0.2544-0.2676
7'copper'0.25310.1503
8'stage'0.2399-0.0663
9'hepato'0.21160.0657
10'spiders'0.1987-0.0039
11'ast'0.18020.3177
12'trig'0.15790.2934
13'age'0.1256-0.2995
14'chol'0.09630.4533
15'alkphos'0.09000.1759
16'trt'0.00600.0729
17'sex'-0.03290.0521
18'platelet'-0.13080.2553
19'albumin'-0.28560.1006
20'logalb'-0.28950.1048
% Table of PCA sorted by PC2
T_PC2 = sortrows(T,'PC2_coeff','descend')
T_PC2 = 20×3 table
 Var1PC1_coeffPC2_coeff
1'chol'0.09630.4533
2'ast'0.18020.3177
3'trig'0.15790.2934
4'logbil'0.34590.2797
5'platelet'-0.13080.2553
6'bili'0.32980.2530
7'alkphos'0.09000.1759
8'copper'0.25310.1503
9'logalb'-0.28950.1048
10'albumin'-0.28560.1006
11'trt'0.00600.0729
12'hepato'0.21160.0657
13'sex'-0.03290.0521
14'spiders'0.1987-0.0039
15'stage'0.2399-0.0663
16'ascites'0.2788-0.1801
17'edema'0.2865-0.2080
18'protime'0.2544-0.2676
19'logpro'0.2589-0.2677
20'age'0.1256-0.2995
%Box plots exploring top 5 features recognized as having most weight from from PC1
%Features split on status
f = figure();
set(f, 'Position', expandFigSize)
sgtitle('Most important features identifed by PC1 split by survival status')
features = T_PC1.Var1(1:5);
features_analyzed = cell(5,2);
features_analyzed(:,1) = features;
for i=1:length(features)
subplot(2,3,i)
boxplot(cirrhosis_status.(features{i}),cirrhosis_status.status, 'Labels',{'Survived','Died'})
title(features{i})
[h,p] = ttest2(cirrhosis_status.(features{i})(cirrhosis_status.status == 0),cirrhosis_status.(features{i})(cirrhosis_status.status == 1));
features_analyzed{i,2} = p;
end
T_PC1_analyzed = cell2table(features_analyzed);
T_PC1_analyzed.Properties.VariableNames = {'Features','P values'}
T_PC1_analyzed = 5×2 table
 FeaturesP values
1'logbil'7.4316e-18
2'bili'3.5816e-13
3'edema'1.5362e-08
4'ascites'3.0548e-07
5'logpro'3.8126e-12

K-Means Clustering

figure();
kValues = 2:10;
n = length(kValues);
s_score = zeros(n,1);
for i = 1:n
idx = kmeans(table2array(cirrhosis_status),kValues(i)); % k-means clustering
s = silhouette(table2array(cirrhosis_status),idx); % silhouette values
s_score(i) = mean(s); % silhouette score
end
table(kValues',s_score)
ans = 9×2 table
 Var1s_score
120.9274
230.8810
340.7412
450.7236
560.6846
670.6478
780.6429
890.5772
9100.5978
k = find(s_score==max(s_score))+1 % k = 2 is best
k = 2
% Plot silhouette for optimal cluster size
idx = kmeans(table2array(cirrhosis_status),k);
silhouette(table2array(cirrhosis_status),idx)
title('Silhouette Plot k=2')
%Predicting status for each cluster from k-means
cluster1 = (cirrhosis_status.status(idx == 1));
cluster1_survived = sum((cluster1 == 0))
cluster1_survived = 7
cluster1_died = sum((cluster1 == 1))
cluster1_died = 17
cluster2 = (cirrhosis_status.status(idx == 2));
cluster2_survived = sum((cluster2 == 0))
cluster2_survived = 158
cluster2_died = sum((cluster2 == 1))
cluster2_died = 94

Task 5 - Supervised Learning on Time

Linear regression using all the data and nonlog values of time

b_lin = fitlm(cirrhosis)
b_lin =
Linear regression model: time ~ 1 + age + albumin + alkphos + ascites + bili + chol + edema + hepato + platelet + protime + sex + ast + spiders + stage + status + trt + trig + copper + logalb + logbil + logpro Estimated Coefficients: Estimate SE tStat pValue _________ ________ __________ __________ (Intercept) -26619 10653 -2.4988 0.013093 age 1.4508 5.8132 0.24956 0.80313 albumin 1083.3 1282.2 0.84487 0.39898 alkphos 0.11516 0.027248 4.2264 3.3141e-05 ascites -180.94 296.96 -0.6093 0.54287 bili -24.179 25.017 -0.96651 0.33471 chol -0.075279 0.28404 -0.26503 0.7912 edema -368.75 280.94 -1.3126 0.19051 hepato -7.9454 128.19 -0.06198 0.95063 platelet 0.029573 0.63797 0.046354 0.96306 protime -1366.2 631.37 -2.1638 0.031411 sex -1.3879 185.43 -0.0074847 0.99403 ast 1.427 1.159 1.2312 0.2194 spiders -82.574 134.67 -0.61314 0.54033 stage -183.65 79.029 -2.3238 0.020924 status -264.43 72.511 -3.6468 0.00032215 trt 29.459 109.84 0.26819 0.78877 trig 1.1024 0.96721 1.1398 0.25544 copper -1.4629 0.7702 -1.8994 0.05865 logalb -1787.2 4290.2 -0.41658 0.67733 logbil -214.87 123.76 -1.7362 0.083737 logpro 17841 7293.8 2.4461 0.01512 Number of observations: 276, Error degrees of freedom: 254 Root Mean Squared Error: 872 R-squared: 0.433, Adjusted R-Squared: 0.386 F-statistic vs. constant model: 9.24, p-value = 2.5e-21
% Show the significant coefficients
b_lin.Coefficients(b_lin.Coefficients.pValue < 0.05,:)
ans = 6×4 table
 EstimateSEtStatpValue
1 (Intercept)-2.6619e+041.0653e+04-2.49880.0131
2 alkphos0.11520.02724.22640.0000
3 protime-1.3662e+03631.3702-2.16380.0314
4 stage-183.649279.0288-2.32380.0209
5 status-264.431772.5114-3.64680.0003
6 logpro1.7841e+047.2938e+032.44610.0151

Lasso, Stepwise, and Linear Regression

Used cirrhosis_log because we used log time for all regression models
% Top 10 features for linear regression
[B,I] = maxk(abs(R_pear_log(end,1:end-1)),10);
linear_reg_names = [{'(Intercept)'},cirrhosis_log_names(I)];
% Generate crossvalidation groups
NFolds = 5;
indices = crossvalind('kfold',size(cirrhosis_log,1),NFolds);
% Declare datasets for storing the outputs
regressionModel = struct();
subsets = {subset, subset_log};
% Run the analysis
for i=1:NFolds
test = (indices == i);
train = ~test;
% Extract train rows from X and Y
trainData = cirrhosis_log(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_log(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = table2array(testData(:,end));
% Lasso
% Make model and predict
[lassoB,lassoFit] = lasso(Xtrain,Ytrain,'CV',NFolds);
coef0 = lassoFit.Intercept(lassoFit.IndexMinMSE);
coef = lassoB(:,lassoFit.IndexMinMSE);
Ypred = Xtest*coef + coef0;
% Get best coef
ncoef = sum(coef ~= 0);
[~,maxCoefIdx] = maxk(abs(coef),ncoef);
% Save fit for later and evaluate the model
regressionModel.lasso.fit{i} = lassoFit;
regressionModel.lasso.coeffVals{i} = coef(maxCoefIdx);
regressionModel.lasso.coeffNames{i} = cirrhosis_names(maxCoefIdx)';
regressionModel.lasso.ncoef(i) = ncoef;
regressionModel.lasso.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.lasso.corrR(i),regressionModel.lasso.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.lasso.rankCorrR(i),regressionModel.lasso.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
% Stepwise fit
for j=1:3
if j == 1
stepFit = stepwiselm(trainData,'Upper','linear','ResponseVar','time','CategoricalVars',categVar,'Verbose',0);
else
stepFit = stepwiselm(trainData,'Upper','linear','ResponseVar','time','PredictorVars',subsets{j-1},'Verbose',0);
end
Ypred = stepFit.predict(Xtest);
[~,coefIdx] = sort(stepFit.Coefficients.pValue);
% Save fit for later and evaluate the model
regressionModel.step(j).fit{i} = stepFit;
regressionModel.step(j).coeffNames{i} = stepFit.Coefficients.Row(coefIdx);
regressionModel.step(j).coeffVals{i} = stepFit.Coefficients.Estimate(coefIdx);
regressionModel.step(j).ncoef(i) = stepFit.NumPredictors;
regressionModel.step(j).rSquared(i) = stepFit.Rsquared.Ordinary;
regressionModel.step(j).rSquaredAdj(i) = stepFit.Rsquared.Adjusted;
regressionModel.step(j).absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.step(j).corrR(i),regressionModel.step(j).corrP(i)] = corr(Ytest,Ypred);
[regressionModel.step(j).rankCorrR(i),regressionModel.step(j).rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
end
% Linear
% Make model and predict
mdl = fitlm(Xtrain(:,I),Ytrain);
Ypred = mdl.predict(Xtest(:,I));
[~,coefIdx] = sort(mdl.Coefficients.pValue);
% Evaluate the model
regressionModel.linear.fit{i} = mdl;
regressionModel.linear.features{i} = linear_reg_names;
regressionModel.linear.coeffNames{i} = linear_reg_names(coefIdx)';
regressionModel.linear.coeffVals{i} = mdl.Coefficients.Estimate(coefIdx);
regressionModel.linear.rSquared(i) = mdl.Rsquared.Ordinary;
regressionModel.linear.rSquaredAdj(i) = mdl.Rsquared.Adjusted;
regressionModel.linear.ncoef(i) = length(B);
regressionModel.linear.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.linear.corrR(i),regressionModel.linear.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.linear.rankCorrR(i),regressionModel.linear.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
end

Lasso Analysis

rowNames = cellfun(@(s)sprintf('Subsample %d',s),num2cell(transpose(1:NFolds)),'UniformOutput',false);
varNames = ["ncoeff","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.lasso.ncoef',regressionModel.lasso.absError',regressionModel.lasso.corrR',...
regressionModel.lasso.corrP',regressionModel.lasso.rankCorrR',regressionModel.lasso.rankCorrP',...
'VariableNames', varNames,'RowNames', rowNames)
ans = 5×6 table
 ncoeffAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 190.44320.70361.4711e-090.53472.1819e-05
2 Subsample 280.53500.74675.9488e-110.63693.7350e-07
3 Subsample 3100.39870.68567.6779e-090.60072.1068e-06
4 Subsample 4160.53240.60559.6915e-070.54582.2436e-05
5 Subsample 5140.43320.49381.2742e-040.45665.2559e-04
maxRow = max(regressionModel.lasso.ncoef);
T = table();
for i=1:NFolds
tempT = table([regressionModel.lasso.coeffNames{i},num2cell(regressionModel.lasso.coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T = [T T2Pad];
end
T
T = 16×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'copper'1.0398'edema'-0.8454'copper'0.8147'copper'1.2322'copper'0.8193
2'ascites'-0.5812'copper'0.5156'edema'-0.6898'ascites'-0.4140'edema'-0.5482
3'edema'-0.4742'ascites'-0.4198'ascites'-0.5355'edema'-0.3415'ascites'-0.3432
4'logalb'-0.1906'logalb'-0.1689'logalb'-0.1439'logalb'-0.1007'albumin'0.1643
5'stage'-0.0950'stage'-0.0834'stage'-0.0620'stage'-0.0899'logalb'-0.1306
6'spiders'-0.0741'bili'-6.2160e-04'spiders'-0.0194'hepato'0.0874'stage'-0.0567
7'bili'-0.0018'trig'-3.7997e-04'bili'-0.0084'spiders'-0.0740'spiders'-0.0367
8'trig'-7.2035e-04'alkphos'3.4316e-05'age'-0.0021'status'-0.0302'bili'-0.0274
9'alkphos'3.5227e-05NaNNaN'trig'-7.9512e-04'bili'-0.0288'status'-0.0101
10NaNNaNNaNNaN'alkphos'3.5382e-05'protime'-0.0181'age'-0.0017
11NaNNaNNaNNaNNaNNaN'age'-0.0060'trig'-9.7533e-04
12NaNNaNNaNNaNNaNNaN'trig'-0.0016'platelet'3.7506e-04
13NaNNaNNaNNaNNaNNaN'trt'7.9540e-04'chol'1.9142e-04
14NaNNaNNaNNaNNaNNaN'platelet'3.3887e-04'alkphos'4.3230e-05
15NaNNaNNaNNaNNaNNaN'ast'-2.9267e-04NaNNaN
16NaNNaNNaNNaNNaNNaN'alkphos'4.6125e-05NaNNaN

Linear Analysis

linVarNames = ["R-Squared","Adjusted R-Squared","ncoeff","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.linear.rSquared',regressionModel.linear.rSquaredAdj',regressionModel.linear.ncoef',...
regressionModel.linear.absError',regressionModel.linear.corrR',regressionModel.linear.corrP',...
regressionModel.linear.rankCorrR',regressionModel.linear.rankCorrP',...
'VariableNames', linVarNames,'RowNames', rowNames)
ans = 5×8 table
 R-SquaredAdjusted R-SquaredncoeffAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 10.49580.4717100.46210.70111.7786e-090.52300.0000
2 Subsample 20.45160.4255100.52660.74835.1533e-110.60490.0000
3 Subsample 30.50430.4807100.43580.57514.3764e-060.54650.0000
4 Subsample 40.50610.4826100.50720.64121.3378e-070.52810.0000
5 Subsample 50.53410.5119100.44030.48691.6364e-040.42720.0013
maxRow = max(regressionModel.linear.ncoef);
T = table();
for i=1:NFolds
tempT = table([regressionModel.linear.coeffNames{i},num2cell(regressionModel.linear.coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T = [T T2Pad];
end
T
T = 11×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'stage'-0.1503'edema'-0.9025'edema'-0.7777'copper'-0.0014'edema'-0.6937
2'ascites'-0.5725'stage'-0.1236'logpro'18.0918'stage'-0.1230'copper'-0.0011
3'edema'-0.4901'ascites'-0.4427'protime'-1.6187'edema'-0.4783'stage'-0.1046
4'logalb'9.2550'logbil'-0.1540'ascites'-0.4853'ascites'-0.3469'ascites'-0.3729
5'logbil'-0.1591'logpro'6.6627'logalb'5.8094'protime'-0.5744'protime'-0.7450
6'albumin'-2.3638'protime'-0.5577'stage'-0.1089'logalb'3.8982'logpro'8.3625
7'copper'-9.2122e-04'copper'-5.1500e-04'(Intercept)'-19.5510'bili'-0.0236'logbil'-0.1177
8'protime'-0.5301'logalb'2.5833'copper'-9.1400e-04'logpro'6.1645'bili'-0.0224
9'logpro'5.9515'albumin'-0.6051'logbil'-0.1387'logbil'-0.0908'(Intercept)'-5.4262
10'bili'-0.0106'(Intercept)'-2.8779'albumin'-1.4768'albumin'-0.7797'albumin'0.2574
11'(Intercept)'-3.5729'bili'-0.0045'bili'-0.0080'(Intercept)'-2.4954'logalb'0.6020

Stepwise Analysis

stepVarNames = ["R-Squared","Adjusted R-Squared","Number of Predictors","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
stepTables = cell(3,1);
for i=1:3
stepTables{i} = table(regressionModel.step(i).rSquared',regressionModel.step(i).rSquaredAdj',...
regressionModel.step(i).ncoef',regressionModel.step(i).absError',regressionModel.step(i).corrR',...
regressionModel.step(i).corrP',regressionModel.step(i).rankCorrR',regressionModel.step(i).rankCorrP',...
'VariableNames', stepVarNames,'RowNames', rowNames);
end

Stepwise Full

j = 1;
stepTables{j}
ans = 5×8 table
 R-SquaredAdjusted R-SquaredNumber of PredictorsAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 10.49540.478760.43350.72492.6534e-100.57154.2235e-06
2 Subsample 20.44810.435240.52530.71131.1526e-090.62845.6553e-07
3 Subsample 30.50340.487160.40810.68011.1264e-080.59083.3061e-06
4 Subsample 40.52520.507370.50060.62902.7076e-070.58224.8634e-06
5 Subsample 50.55060.533770.45160.48821.5630e-040.44337.8767e-04
maxRow = max(cellfun('length',regressionModel.step(j).coeffNames));
T = table();
for i=1:NFolds
tempT = table([regressionModel.step(j).coeffNames{i},num2cell(regressionModel.step(j).coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T = [T T2Pad];
end
T
T = 9×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'(Intercept)'5.8974'(Intercept)'6.3724'(Intercept)'6.2523'(Intercept)'6.3346'(Intercept)'5.7779
2'logbil'-0.2326'edema_1'-1.3755'logbil'-0.2044'logbil'-0.2195'logbil'-0.2953
3'alkphos'5.9631e-05'logbil'-0.2369'edema_1'-0.8244'logalb'1.3140'albumin'0.4799
4'logalb'1.3461'alkphos'5.6260e-05'alkphos'6.3858e-05'alkphos'6.0539e-05'edema_1'-0.6797
5'ascites_1'-0.6639'edema_0.5'-0.3729'logalb'1.0773'copper'-0.0017'alkphos'6.5456e-05
6'edema_1'-0.5777'logalb'0.9083'ascites_1'-0.6136'edema_1'-0.6326'ascites_1'-0.4360
7'copper'-0.0012NaNNaN'copper'-0.0012'ascites_1'-0.4397'chol'4.3342e-04
8'edema_0.5'-0.2519NaNNaN'edema_0.5'-0.3204'age'-0.0077'copper'-0.0011
9NaNNaNNaNNaNNaNNaN'edema_0.5'-0.0529'edema_0.5'-0.1538

Stepwise Subset

j = 2;
stepTables{j}
ans = 5×8 table
 R-SquaredAdjusted R-SquaredNumber of PredictorsAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 10.46080.445660.41960.70180.00000.57560.0000
2 Subsample 20.42370.410350.54150.72280.00000.61410.0000
3 Subsample 30.46750.455150.43040.63240.00000.64300.0000
4 Subsample 40.45990.449940.53040.62650.00000.58620.0000
5 Subsample 50.51410.502850.49580.40150.00240.37030.0057
maxRow = max(cellfun('length',regressionModel.step(j).coeffNames));
T = table();
for i=1:NFolds
tempT = table([regressionModel.step(j).coeffNames{i},num2cell(regressionModel.step(j).coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T = [T T2Pad];
end
T
T = 7×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'(Intercept)'5.9967'(Intercept)'6.5704'(Intercept)'6.2191'(Intercept)'5.7191'(Intercept)'5.3539
2'bili'-0.0477'edema'-0.9239'bili'-0.0486'bili'-0.0639'bili'-0.0787
3'albumin'0.4463'bili'-0.0400'edema'-0.8078'albumin'0.5189'albumin'0.5929
4'ascites'-0.7451'alkphos'4.8620e-05'albumin'0.3744'edema'-0.8402'edema'-0.7831
5'alkphos'4.8464e-05'albumin'0.2744'ascites'-0.6061'alkphos'4.9037e-05'alkphos'5.9723e-05
6'spiders'-0.2315'ascites'-0.5059'alkphos'5.3121e-05NaNNaN'chol'4.6171e-04
7'edema'-0.4829NaNNaNNaNNaNNaNNaNNaNNaN

Stepwise Subset Log

j = 3;
stepTables{j}
ans = 5×8 table
 R-SquaredAdjusted R-SquaredNumber of PredictorsAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 10.48500.473050.43330.70191.6733e-090.52942.7260e-05
2 Subsample 20.45220.439450.53340.73641.4849e-100.59892.2891e-06
3 Subsample 30.49050.478750.41540.66333.4301e-080.61581.0392e-06
4 Subsample 40.48210.470050.47990.70132.4687e-090.62885.5566e-07
5 Subsample 50.53650.523560.45670.49121.4004e-040.44048.5836e-04
maxRow = max(cellfun('length',regressionModel.step(j).coeffNames));
T = table();
for i=1:NFolds
tempT = table([regressionModel.step(j).coeffNames{i},num2cell(regressionModel.step(j).coeffVals{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T2Pad = [tempT;repmat({[{NaN},{NaN}]},maxRow-size(tempT,1),1)];
T = [T T2Pad];
end
T
T = 7×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'(Intercept)'5.7683'(Intercept)'6.5915'(Intercept)'6.1085'(Intercept)'5.6419'(Intercept)'5.3857
2'logbil'-0.2823'logbil'-0.2273'logbil'-0.2484'logbil'-0.2781'logbil'-0.3488
3'logalb'1.3844'edema'-0.9090'edema'-0.7944'logalb'1.4854'logalb'1.6016
4'alkphos'5.5215e-05'alkphos'5.3089e-05'ascites'-0.6706'edema'-0.6196'edema'-0.5924
5'ascites'-0.6648'ascites'-0.5026'alkphos'5.8366e-05'alkphos'5.3362e-05'alkphos'5.9103e-05
6'edema'-0.5732'logalb'0.7486'logalb'1.1322'ascites'-0.4790'chol'4.8689e-04
7NaNNaNNaNNaNNaNNaNNaNNaN'ascites'-0.4558

Overall Averages Analysis

overSum = struct();
for i=1:3
overSum.step(1,i) = mean(regressionModel.step(i).corrR);
overSum.step(2,i) = mean(regressionModel.step(i).rankCorrR);
overSum.step(3,i) = mean(regressionModel.step(i).absError);
overSum.step(4,i) = mean(regressionModel.step(i).ncoef);
end
overSum.linear = [mean(regressionModel.linear.corrR)
mean(regressionModel.linear.rankCorrR)
mean(regressionModel.linear.absError)
mean(regressionModel.linear.ncoef)];
overSum.lasso = [mean(regressionModel.lasso.corrR)
mean(regressionModel.lasso.rankCorrR)
mean(regressionModel.lasso.absError)
mean(regressionModel.lasso.ncoef)];
sumVarNames = ["Stepwise Full", "Stepwise Subset", "Stepwise Subset Log", "Linear FitLm", "Lasso"];
sumRowNames = ["Pearsons Correlation","Rank Correlation","Mean Absolute Error","Number of Coefficients"];
table(overSum.step(:,1),overSum.step(:,2),overSum.step(:,3),overSum.linear,overSum.lasso,...
'VariableNames', sumVarNames,'RowNames', sumRowNames)
ans = 4×5 table
 Stepwise FullStepwise SubsetStepwise Subset LogLinear FitLmLasso
1 Pearsons Correlation0.64670.61700.65880.63050.6470
2 Rank Correlation0.56320.55780.56270.52590.5549
3 Mean Absolute Error0.46380.48350.46370.47440.4685
4 Number of Coefficients6.00005.00005.200010.000011.4000

Method Comparision Plots

figure();
subplot(1,2,1)
X = categorical({'Lasso','Stepwise Full','Stepwise Subset','Stepwise Subset Log','Linear'});
bar(X,[overSum.lasso(1),overSum.step(1,:),overSum.linear(1)])
title('Correlations for Each Regression Model')
xlabel('Regression Model')
ylabel("Pearson's r")
subplot(1,2,2)
bar(X,[overSum.lasso(4),overSum.step(4,:),overSum.linear(4)])
title('Features Used for Each Regression Model')
xlabel('Regression Model')
ylabel("Features")

Task 6 - Supervised Learning on Status

Logistic Regression

% Number of features to select
NFeatures = 8;
cirrhosis_status_array = table2array(cirrhosis_status);
accuracyGuess = zeros(100,NFolds);
meanGuess = zeros(NFolds,1);
sigDiff = zeros(NFolds,2);
for i=1:NFolds
test = (indices == i);
train = ~test;
% Extract train rows from X and Y
Xtrain = cirrhosis_status_array(train,1:end-1);
Ytrain = cirrhosis_status_array(train,end);
% Extract test rows from X and Y
Xtest = cirrhosis_status_array(test,1:end-1);
Ytest = cirrhosis_status_array(test,end);
% Create logistic model with train data.
[b,dev,stats] = mnrfit(Xtrain, categorical(Ytrain));
[p,I] = sort(stats.p(2:end,:));
[p_min,min_idx] = min(p,[],2);
regressionModel.logist.bestP{i} = p_min(1:NFeatures);
regressionModel.logist.bestFeatures{i} = diag(I(1:NFeatures,min_idx(1:NFeatures)));
regressionModel.logist.bestFeaturesNames{i} = cirrhosis_status_names(regressionModel.logist.bestFeatures{i})';
regressionModel.logist.bestFeaturesCoeff{i} = b(I(1:NFeatures));
% Predict Y values from model.
Ypred = mnrval(b,Xtest);
[~,Ypred] = max(Ypred,[],2);
Ypred = double(Ypred - 1);
%Comparing model to 100 random guesses
for k = 1:100
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
end
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
% Evaluate Model
regressionModel.logist.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.logist.accuracy(i) = accuracy;
regressionModel.logist.precision(i) = sum(Ypred ~=0 & Ypred==Ytest)/sum(Ypred~=0);
regressionModel.logist.recall(i) = sum(Ypred ~=0 & Ytest ==Ypred)/sum(Ytest~=0);
regressionModel.logist.absError(i) = mean(abs(Ypred - Ytest));
[regressionModel.logist.corrR(i),regressionModel.logist.corrP(i)] = corr(Ypred,Ytest);
[regressionModel.logist.rankCorrR(i),regressionModel.logist.rankCorrP(i)] = corr(Ypred,Ytest,'Type','Spearman');
regressionModel.logist.meanGuessError(i) = meanGuessError;
regressionModel.logist.compareGuess(i) = sigDiff(i,2);
end
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 2.164030e-16.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 7.961029e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 6.517939e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 5.336438e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 4.369106e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.577122e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.436861e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.302100e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.172622e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.147343e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.122264e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.117273e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.112289e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.107313e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.102346e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.101353e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.101155e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100956e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100758e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100559e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100520e-17.
Warning: Matrix is close to singular or badly scaled. Results may be inaccurate. RCOND = 3.100520e-17.

Logistic Regression Analysis

logVarNames = ["Accuracy","Precision","Recall","Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.logist.accuracy',regressionModel.logist.precision',regressionModel.logist.recall',...
regressionModel.logist.absError',regressionModel.logist.corrR',regressionModel.logist.corrP',...
regressionModel.logist.rankCorrR',regressionModel.logist.rankCorrP', regressionModel.logist.meanGuessError',...
regressionModel.logist.compareGuess','VariableNames',logVarNames,'RowNames', rowNames)
ans = 5×10 table
 AccuracyPrecisionRecallAverage Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-valMean Guess ErrorGuess Difference
1 Subsample 10.71430.68180.62500.28570.41160.00160.41160.0016-0.24140.0005
2 Subsample 20.76360.72730.69570.23640.51160.00010.51160.0001-0.28290.0000
3 Subsample 30.81820.68420.76470.18180.58970.00000.58970.0000-0.41200.0000
4 Subsample 40.74550.71430.65220.25450.47180.00030.47180.0003-0.26980.0000
5 Subsample 50.70910.75000.50000.29090.40510.00220.40510.0022-0.23160.0012
f = figure();
set(f, 'Position', expandFigSize)
for i=1:NFolds
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.logist.confusion{i});
cm.Title = sprintf("Subsample %d",i);
end
sgtitle("Status Classification Using Logistic Regression")
T = table();
for i=1:NFolds
tempT = table([regressionModel.logist.bestFeaturesNames{i},...
num2cell(regressionModel.logist.bestFeaturesCoeff{i})],...
'VariableNames',cellstr(sprintf("Subset %i",i)));
T = [T tempT];
end
T
T = 8×5 table
 Subset 1Subset 2Subset 3Subset 4Subset 5
1'age'88.8792'alkphos'-7.2926'age'22.5655'alkphos'-6.7341'age'113.5512
2'alkphos'-3.9411'age'79.1033'alkphos'-6.3837'age'58.3636'logpro'-0.5337
3'logpro'-0.5764'logpro'-0.5537'ast'0.2563'copper'-0.0025'protime'-0.0016
4'ast'0.5772'ast'0.7510'spiders'-0.0061'sex'2.8046'sex'5.4002
5'protime'-0.0020'trig'0.1450'albumin'-0.0660'logbil'24.4058'trig'0.1485
6'logbil'13.6599'protime'0.0013'trig'0.1241'logpro'-0.6428'alkphos'-6.4620
7'copper'-2.9115e-05'sex'3.6249'logalb'-0.0027'stage'-0.4353'hepato'-0.3488
8'stage'0.1487'logbil'24.6619'copper'-0.0041'trt'-0.2967'logbil'24.0589

Task 7 - Random Forest Model

Tree Bagger Time Regression

Used cirrhosis_log again because it's regression.
for i=1:NFolds
test = (indices == i);
train = ~test;
% Extract train rows from X and Y
trainData = cirrhosis_log(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_log(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = table2array(testData(:,end));
% Create linear model with train data.
rfMdlTime = TreeBagger(100,Xtrain,Ytrain,'Method','regression','OOBPrediction','On','OOBPredictorImportance','on');
Ypred = rfMdlTime.predict(Xtest);
% Evaluate the model
regressionModel.rfMdlTime.fit{i} = rfMdlTime;
regressionModel.rfMdlTime.absError(i) = mean(abs(Ypred-Ytest));
[regressionModel.rfMdlTime.corrR(i),regressionModel.rfMdlTime.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.rfMdlTime.rankCorrR(i),regressionModel.rfMdlTime.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
end

Tree Bagger Time Regression Analysis

f = figure();
set(f, 'Position', expandFigSize)
for i=1:NFolds
subplot(ceil(NFolds/2),floor(NFolds/2),i)
bar(regressionModel.rfMdlTime.fit{i}.OOBPermutedPredictorDeltaError)
xlabel('Feature Index')
ylabel('Out-of-Bag Feature Importance')
title(sprintf('Subsample %i',i))
xticks(1:length(cirrhosis_log_names)-1)
xticklabels(cirrhosis_log_names(1:end-1))
xtickangle(45)
end
sgtitle('Out-of-Bag Feature Imp for Time Prediction')
rfVarNames = ["Average Absolute Error","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val"];
table(regressionModel.rfMdlTime.absError',regressionModel.rfMdlTime.corrR',...
regressionModel.rfMdlTime.corrP',regressionModel.rfMdlTime.rankCorrR',regressionModel.rfMdlTime.rankCorrP',...
'VariableNames', rfVarNames,'RowNames', rowNames)
ans = 5×5 table
 Average Absolute ErrorPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-val
1 Subsample 10.43670.68480.00000.56540.0000
2 Subsample 20.49870.83680.00000.70930.0000
3 Subsample 30.36320.76690.00000.62460.0000
4 Subsample 40.52340.58870.00000.54700.0000
5 Subsample 50.45350.39260.00300.36780.0060

Tree Bagger Status Classification

accuracyGuess = zeros(100,NFolds);
meanGuess = zeros(NFolds,1);
sigDiff = zeros(NFolds,2);
for i=1:NFolds
test = (indices == i);
train = ~test;
% Extract train rows from X and Y
trainData = cirrhosis_status(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_status(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = double(table2array(testData(:,end)));
% Create linear model with train data.
rfMdlStatus = TreeBagger(100,Xtrain,Ytrain,'OOBPrediction','On','OOBPredictorImportance','on');
Ypred = double(categorical(rfMdlStatus.predict(Xtest))) - 1;
%Comparing model to 100 random guesses
for k = 1:100
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
end
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
% Evaluate the model
regressionModel.rfMdlStatus.fit{i} = rfMdlStatus;
regressionModel.rfMdlStatus.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.rfMdlStatus.absError(i) = mean(abs(Ypred-Ytest));
regressionModel.rfMdlStatus.accuracy(i) = accuracy;
[regressionModel.rfMdlStatus.corrR(i),regressionModel.rfMdlStatus.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.rfMdlStatus.rankCorrR(i),regressionModel.rfMdlStatus.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
regressionModel.rfMdlStatus.meanGuessError(i) = meanGuessError;
regressionModel.rfMdlStatus.compareGuess(i) = sigDiff(i,2);
end

Tree Bagger Status Classification Analysis

f = figure();
set(f, 'Position', expandFigSize)
for i=1:NFolds
subplot(ceil(NFolds/2),floor(NFolds/2),i)
bar(regressionModel.rfMdlStatus.fit{i}.OOBPermutedPredictorDeltaError)
xlabel('Feature Index')
ylabel('Out-of-Bag Feature Importance')
title(sprintf('Subsample %i',i))
xticks(1:length(cirrhosis_status_names)-1)
xticklabels(cirrhosis_status_names(1:end-1))
xtickangle(45)
end
sgtitle('Out-of-Bag Feature Importance for Time Prediction')
rfClassVarNames = ["Average Absolute Error","Accuracy","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.rfMdlStatus.absError',regressionModel.rfMdlStatus.accuracy',regressionModel.rfMdlStatus.corrR',...
regressionModel.rfMdlStatus.corrP',regressionModel.rfMdlStatus.rankCorrR',regressionModel.rfMdlStatus.rankCorrP',...
regressionModel.rfMdlStatus.meanGuessError',regressionModel.rfMdlStatus.compareGuess', ...
'VariableNames',rfClassVarNames,'RowNames', rowNames)
ans = 5×8 table
 Average Absolute ErrorAccuracyPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-valMean Guess ErrorGuess Difference
1 Subsample 10.21430.78570.55940.00000.55940.0000-0.30180.0000
2 Subsample 20.20000.80000.60000.00000.60000.0000-0.33350.0000
3 Subsample 30.14550.85450.65050.00000.65050.0000-0.43890.0000
4 Subsample 40.29090.70910.40220.00230.40220.0023-0.24650.0011
5 Subsample 50.25450.74550.49580.00010.49580.0001-0.26070.0000
f = figure();
set(f, 'Position', expandFigSize)
for i=1:NFolds
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.rfMdlStatus.confusion{i});
cm.Title = sprintf("Subsample %d",i);
end
sgtitle("Status Classification Using Random Forest")

Task 8 - New Machine Learning Algorithm: Support Vector Machiine

For predicting status - binary classification
for i=1:NFolds
test = (indices == i);
train = ~test;
% Extract train rows from X and Y
trainData = cirrhosis_status(train,:);
Xtrain = table2array(trainData(:,1:end-1));
Ytrain = table2array(trainData(:,end));
% Extract test rows from X and Y
testData = cirrhosis_status(test,:);
Xtest = table2array(testData(:,1:end-1));
Ytest = double(table2array(testData(:,end)));
svmMdlStatus = fitcsvm(cirrhosis_status,'status');
Ypred = double(categorical(svmMdlStatus.predict(Xtest))) - 1;
%Comparing model to 100 random guesses
for k = 1:100
random_pred = Ytrain(randperm(length(Ytest)));
accuracyGuess(k,i) = sum(random_pred == Ytest)/length(Ytest);
end
meanGuess(i) = mean(accuracyGuess(:,i));
accuracy = sum(Ypred == Ytest)/length(Ypred);
%T test comparing model accuracy to random guess accuracy
[sigDiff(i,1),sigDiff(i,2)] = ttest2(accuracy,accuracyGuess(:,i));
meanGuessError = meanGuess(i) - accuracy;
% Evaluate the model
regressionModel.svmMdlStatus.fit{i} = svmMdlStatus;
regressionModel.svmMdlStatus.confusion{i} = confusionmat(Ytest,Ypred);
regressionModel.svmMdlStatus.absError(i) = mean(abs(Ypred-Ytest));
regressionModel.svmMdlStatus.accuracy(i) = accuracy;
[regressionModel.svmMdlStatus.corrR(i),regressionModel.svmMdlStatus.corrP(i)] = corr(Ytest,Ypred);
[regressionModel.svmMdlStatus.rankCorrR(i),regressionModel.svmMdlStatus.rankCorrP(i)] = corr(Ytest,Ypred,'Type','Spearman');
regressionModel.svmMdlStatus.meanGuessError(i) = meanGuessError;
regressionModel.svmMdlStatus.compareGuess(i) = sigDiff(i,2);
end
svmClassVarNames = ["Average Absolute Error","Accuracy","Pearsons Correlation",...
"Pearsons Correlation P-val","Rank Correlation", "Rank Correlation P-val","Mean Guess Error","Guess Difference"];
table(regressionModel.svmMdlStatus.absError',regressionModel.svmMdlStatus.accuracy',regressionModel.svmMdlStatus.corrR',...
regressionModel.svmMdlStatus.corrP',regressionModel.svmMdlStatus.rankCorrR',regressionModel.svmMdlStatus.rankCorrP',...
regressionModel.svmMdlStatus.meanGuessError',regressionModel.svmMdlStatus.compareGuess', ...
'VariableNames',svmClassVarNames,'RowNames', rowNames)
ans = 5×8 table
 Average Absolute ErrorAccuracyPearsons CorrelationPearsons Correlation P-valRank CorrelationRank Correlation P-valMean Guess ErrorGuess Difference
1 Subsample 10.25000.75000.48410.00020.48410.0002-0.28141.4628e-06
2 Subsample 20.21820.78180.55680.00000.55680.0000-0.31131.9432e-06
3 Subsample 30.23640.76360.42200.00130.42200.0013-0.35562.1356e-10
4 Subsample 40.27270.72730.43090.00100.43090.0010-0.25271.7325e-04
5 Subsample 50.23640.76360.52220.00000.52220.0000-0.27351.1850e-04
f = figure();
set(f, 'Position', expandFigSize)
for i=1:NFolds
subplot(ceil(NFolds/2),floor(NFolds/2),i)
cm = confusionchart(regressionModel.svmMdlStatus.confusion{i});
cm.Title = sprintf("Subsample %d",i);
end
sgtitle("Status Classification Using Support Vector Machine")

Task 9 - Status Logistic vs Random Forest vs SVM

compVarNames = ["Logistic","Random Forest", "Support Vector Machine (SVM)"];
compRowNames = ["Pearsons Correlation","Mean Absolute Error","Accuracy"];
compData = [
mean(regressionModel.logist.corrR), mean(regressionModel.rfMdlStatus.corrR), mean(regressionModel.svmMdlStatus.corrR)
mean(regressionModel.logist.absError), mean(regressionModel.rfMdlStatus.absError), mean(regressionModel.svmMdlStatus.absError)
mean(regressionModel.logist.accuracy), mean(regressionModel.rfMdlStatus.accuracy), mean(regressionModel.svmMdlStatus.accuracy)
];
table(compData(:,1),compData(:,2), compData(:,3),...
'VariableNames', compVarNames,'RowNames', compRowNames)
ans = 3×3 table
 LogisticRandom ForestSupport Vector Machine (SVM)
1 Pearsons Correlation0.47800.54160.4832
2 Mean Absolute Error0.24990.22100.2427
3 Accuracy0.75010.77900.7573
f = figure();
set(f, 'Position', [0,0,1000,500])
subplot(1,3,1)
confusionchart(round(mean(cat(3,regressionModel.rfMdlStatus.confusion{:}),3)));
title("Random Forest Status Classification")
subplot(1,3,2)
confusionchart(round(mean(cat(3,regressionModel.logist.confusion{:}),3)));
title("Logistic Regression Status Classification")
subplot(1,3,3)
confusionchart(round(mean(cat(3,regressionModel.svmMdlStatus.confusion{:}),3)));
title("Support Vector Machine Status Classification")